In [ ]:
# Display figures inline in the IPython notebook
%matplotlib inline
In [5]:
from math import sqrt
import numpy as np
from numpy.random import rand
In [6]:
import pandas as pd
In [20]:
df = pd.read_csv("data/bank.csv", sep=';')
df.columns
In [35]:
X = df[['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']].to_numpy()
In [41]:
def kNN_with_dataframes_majority(df, independent_cols, target_col, query_df, k):
    """Predict target_col for each row of query_df by majority vote of its k nearest neighbors."""
    from collections import Counter
    X = df[independent_cols].to_numpy()
    y = df[target_col].to_numpy()
    Q = query_df[independent_cols].to_numpy()
    predictions = []
    for q in Q:
        # distance from this query point to every sample (uses distance(), defined below)
        d = sorted((distance(q, X[j]), j) for j in range(len(X)))
        # majority vote among the labels of the k nearest neighbors
        votes = Counter(y[d[i][1]] for i in range(k))
        predictions.append(votes.most_common(1)[0][0])
    return predictions
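A quick sanity check of the function above. The feature values in the query are made up, and 'y' is assumed to be the target column of bank.csv (as in the UCI version of the file):
In [ ]:
## hypothetical query points; the values are illustrative only
cols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
query = pd.DataFrame([[30, 1500, 200, 1, -1, 0],
                      [55, 50, 600, 4, 300, 2]], columns=cols)
kNN_with_dataframes_majority(df, cols, 'y', query, 3)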
In [45]:
np.shape(X)
In [51]:
# difference vector between sample 2 and a hand-picked query point
d = X[2] - np.array([23, 100, 89, 1, 232, 1])
In [57]:
# the difference vector, its element-wise squares, and the Euclidean distance
d, d*d, sqrt(sum(d*d))
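The same quantity in a single call, using NumPy's built-in Euclidean norm (equivalent to sqrt(sum(d*d)) for a 1-D vector):
In [ ]:
np.linalg.norm(d)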
In [ ]:
## most primitive Euclidean distance
def distance(vect1, vect2):
    s = 0
    ## we assume both vectors have the same dimension
    for i in range(0, len(vect1)):
        delta = vect1[i] - vect2[i]
        s += delta*delta
    return sqrt(s)
distance([1, 2, 3], [-4, 2, 0])
In [ ]:
distance(rand(1000), rand(1000))
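The loop-based distance() works on any indexable sequence, but for NumPy arrays the arithmetic can be vectorized; a minimal sketch (distance_np is a new helper, not from the original code), which should agree with distance() up to floating-point rounding:
In [ ]:
def distance_np(vect1, vect2):
    ## vectorized Euclidean distance; assumes both inputs are 1-D and the same length
    diff = np.asarray(vect1) - np.asarray(vect2)
    return sqrt(np.dot(diff, diff))
distance_np(rand(1000), rand(1000))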
In [ ]:
## most primitive k-NN: X=samples, q=query, k=number of neighbors
def kNN(X, q, k):
    d = []
    for j in range(0, len(X)):
        dis = distance(q, X[j])
        d.append((dis, j))
    ## sort by distance and keep the indices of the k closest samples
    d = sorted(d, key=lambda tp: tp[0])
    res = []
    for i in range(0, k):
        res.append(d[i][1])
    return res  ##, d[:k]
In [ ]:
data = np.random.rand(1000,10)
q = np.random.rand(10)
kNN(data, q, 5)
In [ ]:
## let's try this out on a really big set
import time
data = np.random.rand(10000, 10000)   # 10**8 float64s, about 0.8 GB
q = np.random.rand(10000)
t0 = time.time()
kNN(data, q, 3)
t1 = time.time()
print("This took %.3f seconds" % (t1 - t0))
In [ ]:
The following steps are from Zhitao's example:
In [ ]:
# Import matplotlib package to plot figures
import matplotlib.pyplot as plt
In [ ]:
# Import seaborn package to make figures look better
import seaborn as sns
In [ ]:
# Import pandas package to store and manipulate data
import pandas as pd
In [ ]:
# Import numpy and scipy packages to do scientific analysis
import numpy as np
import scipy as sp
import scipy.stats
In [ ]:
# Import csv package to convert a pandas DataFrame to a CSV file
import csv
In [ ]:
# Import chain from itertools to iterate over concatenated sequences
from itertools import chain
In [ ]:
# Import Counter package to do counting
from collections import Counter, defaultdict
In [ ]:
# Import operator package to sort a dictionary by its values
import operator
In [ ]:
# Import re package to use regular expressions
import re
In [ ]:
# Import timer
import time
In [ ]:
# Import topic model packages
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import ldamodel
from gensim import matutils
In [ ]:
# Import NLTK package
from nltk import sent_tokenize, word_tokenize, porter
from nltk import PorterStemmer
from nltk.corpus import stopwords
In [ ]:
Modified code from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html
In [ ]:
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Olivier Grisel <olivier.grisel@ensta.org>
# Mathieu Blondel <mathieu@mblondel.org>
# Lars Buitinck <L.J.Buitinck@uva.nl>
# License: BSD 3 clause
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
#from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
#from sklearn.svm import LinearSVC
#from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import Perceptron
#from sklearn.linear_model import PassiveAggressiveClassifier
#from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
#from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
In [ ]:
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
class Option(object):
    pass
opts = Option()
# "Print a detailed classification report."
opts.print_report = True
# "Select some number of features using a chi-squared test."
opts.select_chi2 = 3
# "Print the confusion matrix."
opts.print_cm = True
# "Print ten most discriminative terms per class for every classifier."
opts.print_top10 = False
# "Whether to use all categories or not."
opts.all_categories = True
# "Use a hashing vectorizer."
opts.use_hashing = True
# "n_features when using the hashing vectorizer."
opts.n_features = 2 ** 16
# "Remove newsgroup information that is easily overfit: headers, signatures, and quoting."
opts.filtered = True
In [ ]:
###############################################################################
# Load some categories from the training set
if opts.all_categories:
    categories = None
else:
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()
In [ ]:
print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")
data_train = fetch_20newsgroups(subset='train', categories=categories,
shuffle=True, random_state=42,
remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories,
shuffle=True, random_state=42,
remove=remove)
print('data loaded')
In [ ]:
data_train.data[3]
In [ ]:
data_train.target_names
In [ ]:
categories = data_train.target_names  # for case categories == None
def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6
data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)
print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
print()
In [ ]:
# split a training set and a test set
y_train, y_test = data_train.target, data_test.target
print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
### use hashing?
if opts.use_hashing:
    # note: newer scikit-learn releases replaced non_negative=True with alternate_sign=False
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
In [ ]:
np.shape(X_train), len(data_train.data)
In [ ]:
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()
In [ ]:
# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    # note: get_feature_names() became get_feature_names_out() in newer scikit-learn
    feature_names = vectorizer.get_feature_names()
if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()
if feature_names:
    feature_names = np.asarray(feature_names)
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."
In [ ]:
###############################################################################
# Benchmark classifiers
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s"
                           % (category, " ".join(feature_names[top10]))))
        print()
    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))
        print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
In [ ]:
results = benchmark(KNeighborsClassifier(n_neighbors=10))
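NearestCentroid is imported above but never run; as a quick follow-up sketch, the same benchmark harness can be reused to compare it against k-NN:
In [ ]:
results_nc = benchmark(NearestCentroid())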